import os
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\asus\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\asus\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
True
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
label_encoder = LabelEncoder()
porter=PorterStemmer()
scaler = MaxAbsScaler()
positive_folder = 'M:/s8/NLP/project/Dataset/review_polarity/txt_sentoken/pos'
negative_folder = 'M:/s8/NLP/project/Dataset/review_polarity/txt_sentoken/neg'
texts = []
labels = []
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
def preprocess_text(text):
# Tokenization
tokens = word_tokenize(text)
# Lowercasing, removing punctuation, and removing stopwords
stop_words = set(stopwords.words('english'))
processed_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.lower() not in string.punctuation]
# Stemming
ps = PorterStemmer()
processed_tokens = [ps.stem(token) for token in processed_tokens]
# Removing special characters, emojis, and handling contractions
processed_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in processed_tokens]
# Handling numerical data
processed_tokens = ['NUM' if token.isdigit() else token for token in processed_tokens]
# Joining Tokens
processed_text = ' '.join(processed_tokens)
return processed_text
for filename in os.listdir(positive_folder):
with open(os.path.join(positive_folder, filename), 'r') as file:
text = file.read()
processed_text = preprocess_text(text)
texts.append(processed_text)
labels.append('pos')
for filename in os.listdir(negative_folder):
with open(os.path.join(negative_folder, filename), 'r') as file:
text = file.read()
processed_text = preprocess_text(text)
texts.append(processed_text)
labels.append('neg')
# Create DataFrame
df = pd.DataFrame({'text': texts, 'label': labels})
df.to_csv('preprocessed_data.csv', index=False)
df = pd.read_csv('preprocessed_data.csv')
df
| text | label | |
|---|---|---|
| 0 | film adapt comic book plenti success whether r... | pos |
| 1 | everi movi come along suspect studio everi ind... | pos |
| 2 | ve got mail work alot better deserv order make... | pos |
| 3 | jaw rare film grab attent show singl imag sc... | pos |
| 4 | moviemak lot like gener manag nfl team postsal... | pos |
| ... | ... | ... |
| 1995 | anyth stigmata taken warn releas similarlyth... | neg |
| 1996 | john boorman s zardoz goofi cinemat debacl f... | neg |
| 1997 | kid hall acquir tast took least season watch s... | neg |
| 1998 | time john carpent great horror director cours ... | neg |
| 1999 | two parti guy bob head haddaway s danc hit lo... | neg |
2000 rows × 2 columns
neg_df = df[df['label'] == 'neg']
neg_df
| text | label | |
|---|---|---|
| 1000 | plot two teen coupl go church parti drink driv... | neg |
| 1001 | happi bastard s quick movi review damn y2k bug... | neg |
| 1002 | movi like make jade movi viewer thank invent t... | neg |
| 1003 | quest camelot warner bro first featurelength... | neg |
| 1004 | synopsi mental unstabl man undergo psychothera... | neg |
| ... | ... | ... |
| 1995 | anyth stigmata taken warn releas similarlyth... | neg |
| 1996 | john boorman s zardoz goofi cinemat debacl f... | neg |
| 1997 | kid hall acquir tast took least season watch s... | neg |
| 1998 | time john carpent great horror director cours ... | neg |
| 1999 | two parti guy bob head haddaway s danc hit lo... | neg |
1000 rows × 2 columns
pos_df = df[df['label'] == 'pos']
pos_df
| text | label | |
|---|---|---|
| 0 | film adapt comic book plenti success whether r... | pos |
| 1 | everi movi come along suspect studio everi ind... | pos |
| 2 | ve got mail work alot better deserv order make... | pos |
| 3 | jaw rare film grab attent show singl imag sc... | pos |
| 4 | moviemak lot like gener manag nfl team postsal... | pos |
| ... | ... | ... |
| 995 | wow movi s everyth movi funni dramat interest ... | pos |
| 996 | richard gere command actor s alway great film ... | pos |
| 997 | glori star matthew broderick denzel washingto... | pos |
| 998 | steven spielberg s second epic film world war ... | pos |
| 999 | truman trueman burbank perfect name jim carr... | pos |
1000 rows × 2 columns
df2 = df.copy()
df2['text_word_count']=df2['text'].apply(lambda x:len(x.split()))
numerical_feature_cols=['text_word_count']
plt.figure(figsize=(24,6))
fig = px.histogram(df2, x='text_word_count', nbins=50, color_discrete_sequence=['#6495ED'])
fig.update_layout(title_text="Distribution of Text Word Count", title_x=0.5)
fig.show()
<Figure size 2400x600 with 0 Axes>
plt.figure(figsize=(24,6))
for i,col in enumerate(numerical_feature_cols):
plt.subplot(1,3,i+1)
sns.histplot(data=df2,x=col,hue='label',bins=50)
plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()
#Let us first analyze the distribution of the target variable
from IPython.display import display
import matplotlib.pyplot as plt
print('\033[1mTarget Variable Distribution'.center(55))
plt.pie(df2['label'].value_counts(), labels=['Positive','Negative'], counterclock=False, shadow=True,
explode=[0,0.1], autopct='%1.1f%%', radius=1, startangle=215)
plt.show()
Target Variable Distribution
# Calculate average text sequence length
average_length = df2['text'].apply(lambda x: len(x.split())).mean()
# Plot
plt.figure(figsize=(8, 6))
plt.hist(df2['text'].apply(lambda x: len(x.split())), bins=50, color='#6495ED')
plt.axvline(x=average_length, color='red', linestyle='--', label=f'Average Length: {average_length:.2f}')
plt.xlabel('Text Sequence Length')
plt.ylabel('Frequency')
plt.title('Distribution of Text Sequence Length')
plt.legend()
plt.show()
positivedata = df2[df2['label']== 'pos']
positivedata =positivedata['text']
negdata = df2[df2['label']== 'neg']
negdata= negdata['text']
def wordcloud_draw(data, color, s):
words = ' '.join(data)
cleaned_word = " ".join([word for word in words.split() if(word!='movie' and word!='film')])
wordcloud = WordCloud(stopwords=stopwords.words('english'),background_color=color,width=2500,height=2000).generate(cleaned_word)
plt.imshow(wordcloud)
plt.title(s)
plt.axis('off')
plt.figure(figsize=[20,10])
plt.subplot(1,2,1)
wordcloud_draw(positivedata,'white','Most-common Positive words')
plt.subplot(1,2,2)
wordcloud_draw(negdata, 'white','Most-common Negative words')
plt.show()
df['label'] = label_encoder.fit_transform(df['label'])
joblib.dump(label_encoder, f'label_encoder.pkl')
# Check the mapping of classes
print("Class mapping:", label_encoder.classes_)
Class mapping: ['neg' 'pos']
df = shuffle(df, random_state=42)
df.head()
| text | label | |
|---|---|---|
| 1860 | guess wild bachelor parti gone realli bad woul... | 0 |
| 353 | abund trite recycl movi late NUM tremend deman... | 1 |
| 1333 | hotshot defens attorney kevin lomax keanu reev... | 0 |
| 905 | hedwig john cameron mitchel born boy name hans... | 1 |
| 1289 | ve heard call jaw claw s fair summat plot th... | 0 |
def tokenizer(text):
return text.split()
def tokenizer_porter(text):
return [porter.stem(word) for word in text.split()]
tfidf=TfidfVectorizer(strip_accents=None,lowercase=True,preprocessor=None,tokenizer=tokenizer_porter,use_idf=True,norm='l2',smooth_idf=True)
y=df.label.values
x=tfidf.fit_transform(df.text)
joblib.dump(tfidf, f'tfidf.pkl')
print(x)
print(y)
M:\Anaconda\Lib\site-packages\sklearn\feature_extraction\text.py:525: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
(0, 26663) 0.07467370329154899 (0, 27387) 0.0473115491728708 (0, 4305) 0.045066096646297155 (0, 25880) 0.048692631577492505 (0, 15953) 0.058642657480751655 (0, 18638) 0.03380795767847503 (0, 29453) 0.04076222683905151 (0, 8489) 0.0544389676324058 (0, 26559) 0.035660719138076766 (0, 29555) 0.10439146620243106 (0, 9338) 0.04381512611526275 (0, 3382) 0.05414231883612125 (0, 481) 0.04065619963533382 (0, 20314) 0.01992799178965676 (0, 24749) 0.04793812245881269 (0, 22410) 0.028924894569869578 (0, 4620) 0.11010609306834823 (0, 26370) 0.07870483753381535 (0, 14951) 0.05149340226537849 (0, 5512) 0.040815561521280574 (0, 9380) 0.025302241513689382 (0, 14516) 0.02365975817322956 (0, 27020) 0.026367083316675782 (0, 12551) 0.03448504536762791 (0, 28864) 0.05045528814188444 : : (1999, 22888) 0.15164922749549917 (1999, 29552) 0.10222936860227075 (1999, 29602) 0.08330362906635722 (1999, 463) 0.02592208767800416 (1999, 1174) 0.02370275174195459 (1999, 29491) 0.05995116004987929 (1999, 23150) 0.04005238040657449 (1999, 28781) 0.024636463286650003 (1999, 17998) 0.022863752201392265 (1999, 2533) 0.023024110368163108 (1999, 6461) 0.033230149361004005 (1999, 15257) 0.07780583353766761 (1999, 22517) 0.11729191601866788 (1999, 27238) 0.03354401664768899 (1999, 8309) 0.051770036725923965 (1999, 16704) 0.05596976652760347 (1999, 10501) 0.05047540717468999 (1999, 5667) 0.042323818841856826 (1999, 14838) 0.02676715948750948 (1999, 18867) 0.04157516839880564 (1999, 10009) 0.028515671615983614 (1999, 15527) 0.04138320183562037 (1999, 16823) 0.05908479751779132 (1999, 902) 0.020029668021230247 (1999, 17447) 0.10323381657754073 [0 1 0 ... 1 0 0]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=80)
print("Shape of x_train:", x_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
Shape of x_train: (1600, 30612) Shape of x_test: (400, 30612) Shape of y_train: (1600,) Shape of y_test: (400,)
x_train = scaler.fit_transform(x_train)
# Transform the test data
x_test = scaler.transform(x_test)
joblib.dump(scaler, f'scaler.pkl')
print("Shape of x_train_scaled:", x_train.shape)
print("Shape of x_test_scaled:", x_test.shape)
Shape of x_train_scaled: (1600, 30612) Shape of x_test_scaled: (400, 30612)
accuracy = []
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
from sklearn.metrics import classification_report
# Define the logistic regression model
train_accuracy = []
test_accuracy = []
# Define the logistic regression model
lr_model = LogisticRegression(solver='liblinear')
# Define the parameter grid for grid search
param_grid = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
# Perform grid search
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
# Get the best model from grid search
best_lr_model = grid_search.best_estimator_
# Predictions on training data
y_train_pred_lr = best_lr_model.predict(x_train)
train_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)
print("Training Accuracy of Logistic Regression Model:", train_accuracy_lr)
train_accuracy.append(train_accuracy_lr)
# Predictions on testing data
y_test_pred_lr = best_lr_model.predict(x_test)
accuracy_lr = accuracy_score(y_test, y_test_pred_lr)
print("Test Accuracy of Logistic Regression Model:", accuracy_lr)
test_accuracy.append(accuracy_lr)
accuracy.append(accuracy_lr)
print("Classification Report:")
print(classification_report(y_test, y_test_pred_lr))
# Save the trained model
joblib.dump(best_lr_model, 'best_lr_model.pkl')
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred_lr)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve
y_test_probs_lr = best_lr_model.predict_proba(x_test)[:, 1]
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_test_probs_lr)
plt.plot(fpr_lr, tpr_lr, label='Logistic Regression (AUC = {:.2f})'.format(auc(fpr_lr, tpr_lr)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
# Precision-Recall Curve
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test, y_test_probs_lr)
plt.plot(recall_lr, precision_lr, label='Logistic Regression')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
train_sizes, train_scores, test_scores = learning_curve(best_lr_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
train_sizes = np.linspace(0.1, 1.0, 10)
plt.figure(figsize=(8, 6))
# Plotting Testing Accuracy
plt.plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy')
plt.title('Testing Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
Training Accuracy of Logistic Regression Model: 0.999375
Test Accuracy of Logistic Regression Model: 0.8675
Classification Report:
precision recall f1-score support
0 0.88 0.86 0.87 204
1 0.86 0.88 0.87 196
accuracy 0.87 400
macro avg 0.87 0.87 0.87 400
weighted avg 0.87 0.87 0.87 400
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
from sklearn.metrics import classification_report
train_accuracy = []
test_accuracy = []
# Define the SVM model
svm_model = SVC(probability=True)
# Define the parameter grid for grid search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly']}
# Perform grid search
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
# Get the best model from grid search
best_svm_model = grid_search.best_estimator_
# Predictions on training data
y_train_pred_svm = best_svm_model.predict(x_train)
train_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
print("Training Accuracy of SVM Model:", train_accuracy_svm)
train_accuracy.append(train_accuracy_svm)
# Predictions on testing data
y_test_pred_svm = best_svm_model.predict(x_test)
accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print("Test Accuracy of SVM Model:", accuracy_svm)
test_accuracy.append(accuracy_svm)
accuracy.append(accuracy_svm)
print("Classification Report:")
print(classification_report(y_test, y_test_pred_svm))
# Save the trained model
joblib.dump(best_svm_model, 'best_svm_model.pkl')
# Confusion Matrix
conf_matrix_svm = confusion_matrix(y_test, y_test_pred_svm)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_svm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve
y_test_probs_svm = best_svm_model.predict_proba(x_test)[:, 1]
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_test_probs_svm)
plt.plot(fpr_svm, tpr_svm, label='SVM (AUC = {:.2f})'.format(auc(fpr_svm, tpr_svm)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
# Precision-Recall Curve
precision_svm, recall_svm, thresholds_svm = precision_recall_curve(y_test, y_test_probs_svm)
plt.plot(recall_svm, precision_svm, label='SVM')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
# Learning Curve
train_sizes, train_scores, test_scores = learning_curve(best_svm_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Plotting Training Accuracy
axes[0].plot(train_sizes, train_mean, label='Training Accuracy', color='blue')
axes[0].fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color='blue', alpha=0.1)
axes[0].set_xlabel('Number of Training Examples')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Training Accuracy')
axes[0].legend()
# Plotting Testing Accuracy
axes[1].plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
axes[1].fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
axes[1].set_xlabel('Number of Training Examples')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Testing Accuracy')
axes[1].legend()
plt.tight_layout()
plt.show()
Training Accuracy of SVM Model: 0.986875
Test Accuracy of SVM Model: 0.8575
Classification Report:
precision recall f1-score support
0 0.86 0.86 0.86 204
1 0.85 0.86 0.85 196
accuracy 0.86 400
macro avg 0.86 0.86 0.86 400
weighted avg 0.86 0.86 0.86 400
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
train_accuracy = []
test_accuracy = []
# Define the XGBoost model
xgb_model = XGBClassifier()
# Define the parameter grid for grid search
param_grid = {'learning_rate': [0.001, 0.01, 0.1, 1], 'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7]}
# Perform grid search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
# Get the best model from grid search
best_xgb_model = grid_search.best_estimator_
# Predictions on training data
y_train_pred_xgb = best_xgb_model.predict(x_train)
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
print("Training Accuracy of XGBoost Model:", train_accuracy_xgb)
train_accuracy.append(train_accuracy_xgb)
# Predictions on testing data
y_test_pred_xgb = best_xgb_model.predict(x_test)
accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print("Test Accuracy of XGBoost Model:", accuracy_xgb)
test_accuracy.append(accuracy_xgb)
accuracy.append(accuracy_xgb)
print("Classification Report:")
print(classification_report(y_test, y_test_pred_xgb))
# Save the trained model
joblib.dump(best_xgb_model, 'best_xgb_model.pkl')
# Confusion Matrix
conf_matrix_xgb = confusion_matrix(y_test, y_test_pred_xgb)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_xgb, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve
y_test_probs_xgb = best_xgb_model.predict_proba(x_test)[:, 1]
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_test, y_test_probs_xgb)
plt.plot(fpr_xgb, tpr_xgb, label='XGBoost (AUC = {:.2f})'.format(auc(fpr_xgb, tpr_xgb)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
# Precision-Recall Curve
precision_xgb, recall_xgb, thresholds_xgb = precision_recall_curve(y_test, y_test_probs_xgb)
plt.plot(recall_xgb, precision_xgb, label='XGBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
# Learning Curve
train_sizes, train_scores, test_scores = learning_curve(best_xgb_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
train_sizes = np.linspace(0.1, 1.0, 10)
plt.figure(figsize=(8, 6))
# Plotting Testing Accuracy
plt.plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy')
plt.title('Testing Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
Training Accuracy of XGBoost Model: 1.0
Test Accuracy of XGBoost Model: 0.7925
Classification Report:
precision recall f1-score support
0 0.80 0.80 0.80 204
1 0.79 0.79 0.79 196
accuracy 0.79 400
macro avg 0.79 0.79 0.79 400
weighted avg 0.79 0.79 0.79 400
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
ensemble_model = RandomForestClassifier()
# Define the parameter grid for grid search
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [None, 5, 10],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['auto', 'sqrt', 'log2']
}
# Perform grid search
grid_search = GridSearchCV(ensemble_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
# Get the best model from grid search
best_ensemble_model = grid_search.best_estimator_
# Predictions on training data
y_train_pred_ensemble = best_ensemble_model.predict(x_train)
train_accuracy_ensemble = accuracy_score(y_train, y_train_pred_ensemble)
print("Training Accuracy of Ensemble Model:", train_accuracy_ensemble)
# Predictions on testing data
y_test_pred_ensemble = best_ensemble_model.predict(x_test)
accuracy_ensemble = accuracy_score(y_test, y_test_pred_ensemble)
print("Test Accuracy of Ensemble Model:", accuracy_ensemble)
accuracy.append(accuracy_ensemble)
print("Classification Report:")
print(classification_report(y_test, y_test_pred_ensemble))
# Save the trained model
joblib.dump(best_ensemble_model, 'best_ensemble_model.pkl')
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred_ensemble)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve
y_test_probs_ensemble = best_ensemble_model.predict_proba(x_test)[:, 1]
fpr_ensemble, tpr_ensemble, thresholds_ensemble = roc_curve(y_test, y_test_probs_ensemble)
plt.plot(fpr_ensemble, tpr_ensemble, label='Ensemble Model (AUC = {:.2f})'.format(auc(fpr_ensemble, tpr_ensemble)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
# Precision-Recall Curve
precision_ensemble, recall_ensemble, thresholds_ensemble = precision_recall_curve(y_test, y_test_probs_ensemble)
plt.plot(recall_ensemble, precision_ensemble, label='Ensemble Model')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()
# Learning Curve
train_sizes, train_scores, test_scores = learning_curve(best_ensemble_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.figure(figsize=(8, 6))
# Plotting Testing Accuracy
plt.plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy')
plt.title('Testing Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
M:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py:547: FitFailedWarning:
405 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
File "M:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "M:\Anaconda\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
estimator._validate_params()
File "M:\Anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "M:\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.
M:\Anaconda\Lib\site-packages\sklearn\model_selection\_search.py:1051: UserWarning:
One or more of the test scores are non-finite: [ nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan
nan nan nan 0.75125 0.785625 0.798125 0.75375 0.79
0.811875 0.76375 0.78875 0.799375 0.756875 0.775625 0.81125 0.77
0.784375 0.805625 0.771875 0.779375 0.80375 0.766875 0.783125 0.79875
0.75625 0.7925 0.80125 0.760625 0.7825 0.80625 0.695625 0.720625
0.76125 0.69875 0.73625 0.768125 0.721875 0.745 0.76625 0.7075
0.74625 0.780625 0.735625 0.7625 0.77875 0.71125 0.7375 0.7675
0.681875 0.715625 0.7475 0.7025 0.729375 0.739375 0.68 0.73375
0.753125 nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan
nan nan nan nan 0.741875 0.7925 0.798125 0.73375
0.77 0.7875 0.739375 0.768125 0.794375 0.740625 0.7725 0.7975
0.736875 0.76875 0.785625 0.7425 0.783125 0.798125 0.74625 0.766875
0.785625 0.745 0.7725 0.789375 0.74625 0.779375 0.7925 0.653125
0.674375 0.719375 0.65 0.66625 0.71 0.64125 0.65375 0.720625
0.640625 0.6975 0.713125 0.655625 0.681875 0.700625 0.648125 0.69375
0.71 0.6325 0.675625 0.721875 0.643125 0.6625 0.70125 0.630625
0.673125 0.716875 nan nan nan nan nan nan
nan nan nan nan nan nan nan nan
nan nan nan nan nan nan nan nan
nan nan nan nan nan 0.754375 0.781875 0.79
0.748125 0.785 0.7875 0.755 0.79375 0.798125 0.74625 0.781875
0.8 0.766875 0.78 0.79875 0.754375 0.7825 0.79375 0.75375
0.7825 0.800625 0.770625 0.78 0.799375 0.7625 0.78125 0.800625
0.681875 0.695 0.72375 0.68375 0.69375 0.745 0.6725 0.704375
0.735625 0.68625 0.713125 0.74625 0.68125 0.710625 0.74 0.66875
0.72125 0.736875 0.673125 0.7125 0.74125 0.654375 0.7 0.735
0.66625 0.713125 0.734375]
Training Accuracy of Ensemble Model: 1.0
Test Accuracy of Ensemble Model: 0.8025
Classification Report:
precision recall f1-score support
0 0.80 0.82 0.81 204
1 0.81 0.79 0.80 196
accuracy 0.80 400
macro avg 0.80 0.80 0.80 400
weighted avg 0.80 0.80 0.80 400
accuracy
[0.8675, 0.8675, 0.8575, 0.7925, 0.8025]
import matplotlib.pyplot as plt
# List of accuracy scores
accuracy_scores = [0.8575, 0.7925, 0.8675, 0.8025]
models = ['SVM', 'XGBoost', 'logistic regression', 'Random Forest']
plt.figure(figsize=(8, 6))
plt.bar(models, accuracy_scores, color='skyblue')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Accuracy Scores of Different Models')
plt.ylim(0.7, 0.9) # Adjust the y-axis limits if needed
plt.show()
def preprocess_text(text):
# Tokenization
tokens = word_tokenize(text)
# Lowercasing, removing punctuation, and removing stopwords
stop_words = set(stopwords.words('english'))
processed_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.lower() not in string.punctuation]
# Stemming
processed_tokens = [ps.stem(token) for token in processed_tokens]
# Removing special characters, emojis, and handling contractions
processed_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in processed_tokens]
# Handling numerical data
processed_tokens = ['NUM' if token.isdigit() else token for token in processed_tokens]
# Joining Tokens
processed_text = ' '.join(processed_tokens)
return processed_text
def tokenizer(text):
return text.split()
def tokenizer_porter(text):
return [porter.stem(word) for word in text.split()]
test_positive_folder = 'M:/s8/NLP/project/NLP_Sentiment_Analysis/TestScript Dataset/positive'
test_negative_folder = 'M:/s8/NLP/project/NLP_Sentiment_Analysis/TestScript Dataset/negative'
test_texts = []
test_labels = []
test_textsNames = []
for filename in os.listdir(test_positive_folder):
test_textsNames.append(filename.split('.')[0])
with open(os.path.join(test_positive_folder, filename), 'r') as file:
text = file.read()
processed_text = preprocess_text(text)
test_texts.append(processed_text)
test_labels.append('pos')
for filename in os.listdir(test_negative_folder):
test_textsNames.append(filename.split('.')[0])
with open(os.path.join(test_negative_folder, filename), 'r') as file:
text = file.read()
processed_text = preprocess_text(text)
test_texts.append(processed_text)
test_labels.append('neg')
Model = joblib.load('best_lr_model.pkl')
tfidf_test = joblib.load('tfidf.pkl')
scaler_test = joblib.load('scaler.pkl')
label_encoder_test = joblib.load('label_encoder.pkl')
# applying tfidf on x
x_true = tfidf_test.transform(test_texts)
# applying normalization on x using MaxAbsScaler
x_true = scaler_test.transform(x_true)
# applying One-Hot Encoding on True y
y_true = label_encoder_test.transform(test_labels)
y_predicted = []
# Loop through each text in the data
for text_ in range(x_true.shape[0]):
# Predict
Prediction = Model.predict(x_true[text_])
y_predicted.append(Prediction)
if Prediction == 0:
print("Text ", test_textsNames[text_], " is Negative")
elif Prediction == 1:
print("Text ", test_textsNames[text_], " is Positive")
Accuracy = accuracy_score(y_true, y_predicted)
Precision, Recall, F1Score, _ = precision_recall_fscore_support(y_true, y_predicted, average='weighted')
print("Sentiment Prediction : ")
print("Accuracy : ", Accuracy)
print("Precision : ", Precision)
print("Recall : ", Recall)
print("F1-score : ", F1Score)
Text cv010_29198 is Positive Text cv015_29439 is Positive Text cv028_26964 is Negative Text cv030_22893 is Negative Sentiment Prediction : Accuracy : 1.0 Precision : 1.0 Recall : 1.0 F1-score : 1.0